Load global helper functions (including `key.dummy`, which converts the `key` factor into dummy variables).

source("thesis_functions.R")

Load the data.

data <- fread("kpopdata.csv")

# Cast categorical columns to factors, parse the release date, and derive a
# binary popularity label (1 = popularity score of at least 50).
data <- data %>%
  mutate(
    ArtistType = as.factor(ArtistType),
    ArtistGender = as.factor(ArtistGender),
    ArtistGen = factor(ArtistGen),
    release_date = as.POSIXct(release_date, format = "%m/%d/%y"),
    key = factor(key, levels = 0:11),
    mode = as.factor(mode),
    time_signature = factor(time_signature, levels = c(1, 3, 4, 5)),
    popular = factor(ifelse(popularity >= 50, 1, 0))
  )

Understanding popularity

# Distribution of raw popularity scores
hist(data$popularity)

summary(data$popularity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   11.00   23.00   25.88   39.00   94.00

The popularity distribution is heavily right-skewed.

The square root transformation makes it more normal. This will help to meet the multiple linear regression assumptions.

# The square-root transform reduces the right skew
hist(data$popularity^0.5)

Classification of popular songs

# Class balance of the binary popular label (proportion of 0s and 1s)
table(data$popular)/nrow(data)
## 
##         0         1 
## 0.8812021 0.1187979

If we classify a song as popular when its score is at least 50, only ~12% of the data is considered popular.

# NOTE(review): this selection keeps time_signature, but kpop is redefined by a
# later selection that drops it — confirm which feature set is intended.
kpop <- dplyr::select(data, popularity, duration, acousticness, danceability, energy, instrumentalness, key, loudness, mode, speechiness, time_signature, tempo, valence)

General assumptions: the response (popularity) is continuous, ranging from 0 to 100, with a mix of categorical and continuous predictors. The distributions of the variables are not normal; we will check the normality of the error terms where appropriate.

Goal: create model for predicting popularity scores

Multiple Linear Model?

select just audio features

# Keep popularity plus the audio features, then split the data by mode and
# drop the (now constant) mode column from each half.
kpop <- data %>%
  dplyr::select(popularity, duration, acousticness, danceability, energy,
                instrumentalness, key, loudness, mode, speechiness, tempo,
                valence)
kpop0 <- kpop %>% filter(mode == 0) %>% select(-mode)
kpop1 <- kpop %>% filter(mode == 1) %>% select(-mode)

### Kpop mode 0 train and test
### Kpop mode 0 train and test
# 75/25 split; seed fixed for reproducibility
n.train0 <- floor(0.75 * nrow(kpop0))
set.seed(123)
train.idx0 <- sample(nrow(kpop0), n.train0, replace = FALSE)
train0 <- kpop0[train.idx0, ]
test0 <- kpop0[-train.idx0, ]

### Kpop mode 1 train and test
### Kpop mode 1 train and test
# 75/25 split; same seed as the mode-0 split
n.train1 <- floor(0.75 * nrow(kpop1))
set.seed(123)
train.idx1 <- sample(nrow(kpop1), n.train1, replace = FALSE)
train1 <- kpop1[train.idx1, ]
test1 <- kpop1[-train.idx1, ]

For Songs with Mode 0

Full Multiple Linear Regression

fit a multiple linear regression model

# Full multiple linear regression of raw popularity on all features (mode 0)
ml0 <- lm(popularity ~. , data = train0)
summary(ml0)
## 
## Call:
## lm(formula = popularity ~ ., data = train0)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.283 -13.161  -1.606  11.489  64.381 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       98.43022    4.99248  19.716  < 2e-16 ***
## duration          -5.41104    0.55905  -9.679  < 2e-16 ***
## acousticness      -3.81930    2.05303  -1.860 0.062924 .  
## danceability      -2.19279    3.28543  -0.667 0.504544    
## energy           -33.97108    3.33505 -10.186  < 2e-16 ***
## instrumentalness -15.67494    4.37625  -3.582 0.000346 ***
## key1              -1.20599    1.49947  -0.804 0.421291    
## key2              -2.72991    1.85598  -1.471 0.141416    
## key3               3.04084    2.07581   1.465 0.143040    
## key4              -4.34609    1.46983  -2.957 0.003129 ** 
## key5              -0.56490    1.42695  -0.396 0.692217    
## key6               1.34681    1.48984   0.904 0.366062    
## key7               0.04123    1.63277   0.025 0.979856    
## key8              -1.57381    1.76361  -0.892 0.372251    
## key9              -0.94547    1.51224  -0.625 0.531872    
## key10             -1.75720    1.55635  -1.129 0.258955    
## key11             -2.41842    1.38950  -1.740 0.081861 .  
## loudness           3.55623    0.20271  17.543  < 2e-16 ***
## speechiness       23.77941    4.74755   5.009 5.75e-07 ***
## tempo             -0.00429    0.01255  -0.342 0.732398    
## valence          -11.88489    1.78863  -6.645 3.51e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.34 on 3483 degrees of freedom
## Multiple R-squared:  0.143,  Adjusted R-squared:  0.138 
## F-statistic: 29.05 on 20 and 3483 DF,  p-value: < 2.2e-16
# Diagnostic plots: residuals vs fitted, Q-Q, scale-location, leverage
plot(ml0)

There is little or no multicollinearity.

There is no autocorrelation.

The homoscedasticity assumption appears violated (the residuals show heteroscedasticity).

Try again with a square-root transformation to make popularity more normal:

# Refit with a square-root transformed response to reduce the right skew
ml0.sqrt <- lm(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5))
summary(ml0.sqrt)
## 
## Call:
## lm(formula = popularity ~ ., data = train0 %>% mutate(popularity = popularity^0.5))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9651 -1.2163  0.1704  1.3715  5.2110 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      12.8026945  0.5397564  23.719  < 2e-16 ***
## duration         -0.5857021  0.0604408  -9.691  < 2e-16 ***
## acousticness     -0.1418036  0.2219612  -0.639 0.522952    
## danceability     -0.1381154  0.3552010  -0.389 0.697420    
## energy           -3.9085950  0.3605651 -10.840  < 2e-16 ***
## instrumentalness -2.2443174  0.4731338  -4.744 2.18e-06 ***
## key1             -0.1306867  0.1621132  -0.806 0.420214    
## key2             -0.3754834  0.2006575  -1.871 0.061392 .  
## key3              0.1823478  0.2244238   0.813 0.416551    
## key4             -0.5438271  0.1589089  -3.422 0.000628 ***
## key5             -0.1190871  0.1542733  -0.772 0.440212    
## key6              0.1383520  0.1610728   0.859 0.390433    
## key7             -0.0235720  0.1765251  -0.134 0.893779    
## key8             -0.1566102  0.1906708  -0.821 0.411495    
## key9             -0.1520276  0.1634941  -0.930 0.352505    
## key10            -0.2351475  0.1682632  -1.397 0.162353    
## key11            -0.2909129  0.1502246  -1.937 0.052885 .  
## loudness          0.4225379  0.0219162  19.280  < 2e-16 ***
## speechiness       2.1574574  0.5132766   4.203 2.70e-05 ***
## tempo            -0.0007701  0.0013563  -0.568 0.570207    
## valence          -1.1893401  0.1933758  -6.150 8.60e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.874 on 3483 degrees of freedom
## Multiple R-squared:  0.1594, Adjusted R-squared:  0.1546 
## F-statistic: 33.03 on 20 and 3483 DF,  p-value: < 2.2e-16

assumption checking and diagnostics

# Diagnostics for the sqrt-response model
plot(ml0.sqrt)

prediction on test data

# prediction on test data
# NOTE(review): mutating popularity in newdata is unnecessary — predict()
# ignores the response column; the sqrt transform only matters for the
# comparison values below.
yhat.mlr = predict(ml0.sqrt, newdata = test0 %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.mlr <- yhat.mlr - test0$popularity^0.5
rmse.mlr <- sqrt(mean(error.mlr^2))
# Test RMSE and R^2 on the sqrt scale (caret's RMSE()/R2() helpers)
data.frame(
  RMSE = RMSE(yhat.mlr, test0$popularity^0.5),
  R2 = R2(yhat.mlr, test0$popularity^0.5)
)
##       RMSE        R2
## 1 1.863478 0.1332768

Much improved!

Variable Selection: Stepwise 10 fold cross validation

set.seed(123)
# 10-fold cross-validation control, reused by the caret models below
train_control <- trainControl(method = "cv", 
                              number = 10) 
# capture.output() suppresses the verbose stepAIC trace
bye <- capture.output(mlr.step_kcv <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5),  
                 method = "lmStepAIC", trControl = train_control)) 
print(mlr.step_kcv)
## Linear Regression with Stepwise Selection 
## 
## 3504 samples
##   10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 3153, 3152, 3154, 3154, 3154, 3154, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   1.878512  0.1528544  1.516643
# Coefficients retained by the stepwise AIC selection
mlr.step_kcv$finalModel 
## 
## Call:
## lm(formula = .outcome ~ duration + energy + instrumentalness + 
##     key2 + key3 + key4 + key6 + key11 + loudness + speechiness + 
##     valence, data = dat)
## 
## Coefficients:
##      (Intercept)          duration            energy  instrumentalness  
##          12.4040           -0.5830           -3.8007           -2.2441  
##             key2              key3              key4              key6  
##          -0.2588            0.3053           -0.4241            0.2617  
##            key11          loudness       speechiness           valence  
##          -0.1689            0.4214            2.1000           -1.2328

prediction on test data

# prediction on test data
# key.dummy() (from thesis_functions.R) expands the key factor into the
# dummy columns (key1..key11) the stepwise finalModel was fit on
yhat.step_kcv = predict(mlr.step_kcv$finalModel, newdata=key.dummy(test0) %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.step_kcv <- yhat.step_kcv - test0$popularity^0.5
rmse.step_kcv <- sqrt(mean(error.step_kcv^2))
data.frame(
  RMSE = RMSE(yhat.step_kcv, test0$popularity^0.5),
  R2 = R2(yhat.step_kcv, test0$popularity^0.5)
)
##       RMSE        R2
## 1 1.861179 0.1350903

Both of the models have RMSE scores of around 2. This isn't terrible since the range of the transformed popularity scores is 0-10.

Regularized Regression: Ridge

# Ridge regression (alpha = 0); lambda tuned over a fine grid by 10-fold CV
lm.ridge0 <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5), method = "glmnet",
                      trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.ridge0$bestTune
##    alpha lambda
## 47     0  0.047
# best coefficient
lm.ridge0.model <- coef(lm.ridge0$finalModel, lm.ridge0$bestTune$lambda)
lm.ridge0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)      12.3411323068
## duration         -0.5762465119
## acousticness     -0.0662308488
## danceability     -0.1493986374
## energy           -3.5601398635
## instrumentalness -2.2691735701
## key1             -0.0831090967
## key2             -0.3241117452
## key3              0.2260133231
## key4             -0.4931505942
## key5             -0.0698858769
## key6              0.1837187046
## key7              0.0173666066
## key8             -0.1059775161
## key9             -0.1084064701
## key10            -0.1844827951
## key11            -0.2447391761
## loudness          0.3990852347
## speechiness       2.0404769494
## tempo            -0.0007694543
## valence          -1.1748730700
# prediction on test data
# NOTE(review): predict.train() always predicts with the bestTune model, so
# the s= argument is harmless but unused — confirm and consider removing.
yhat.ridge0 = predict(lm.ridge0, s=lm.ridge0$bestTune,newdata=test0%>% mutate(popularity = popularity^0.5))
# RMSE for test data
data.frame(
  RMSE = RMSE(yhat.ridge0, test0$popularity^0.5),
  R2 = R2(yhat.ridge0, test0$popularity^0.5)
)
##       RMSE        R2
## 1 1.861093 0.1339601

Regularized Regression: Lasso

# Lasso regression (alpha = 1); lambda tuned by 10-fold CV; lasso can zero
# out coefficients, so it also performs variable selection
lm.lasso0 <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5), method = "glmnet",
                      trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 1,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.lasso0$bestTune
##    alpha lambda
## 17     1  0.017
# best coefficient
lm.lasso0.model <- coef(lm.lasso0$finalModel, lm.lasso0$bestTune$lambda)
lm.lasso0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                            1
## (Intercept)      11.92608192
## duration         -0.55317885
## acousticness      .         
## danceability      .         
## energy           -3.50918126
## instrumentalness -2.05824555
## key1              .         
## key2             -0.16147902
## key3              0.21698994
## key4             -0.36337170
## key5              .         
## key6              0.22354434
## key7              0.03593591
## key8              .         
## key9              .         
## key10            -0.04408683
## key11            -0.12086307
## loudness          0.39934685
## speechiness       1.77557012
## tempo             .         
## valence          -1.14005299
# prediction on test data
yhat.lasso0 = predict(lm.lasso0, s=lm.lasso0$bestTune,newdata=test0%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.lasso0 <- yhat.lasso0 - test0$popularity^0.5
rmse.lasso0 <- sqrt(mean(error.lasso0^2))
# Test RMSE and R^2 on the sqrt scale
data.frame(
  RMSE = RMSE(yhat.lasso0, test0$popularity^0.5),
  R2 = R2(yhat.lasso0, test0$popularity^0.5)
)
##       RMSE        R2
## 1 1.858953 0.1351553

Regularized Regression: Elastic Net

# Elastic net: tune both alpha (0 = ridge ... 1 = lasso) and lambda by 10-fold CV.
# FIX: removed family = "binomial" — the response (sqrt popularity) is
# continuous, so glmnet must use its default gaussian family; the binomial
# family requires a two-class outcome and caused this chunk to fail (note the
# missing output in the transcript).
lm.enet0 <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5), method = "glmnet",
                      trControl = train_control, 
                      tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
lm.enet0$bestTune
# best coefficient
lm.enet0.model <- coef(lm.enet0$finalModel, lm.enet0$bestTune$lambda)
lm.enet0.model
# prediction on test data
yhat.enet0 = predict(lm.enet0, s=lm.enet0$bestTune,newdata=test0%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.enet0 <- yhat.enet0 - test0$popularity^0.5
rmse.enet0 <- sqrt(mean(error.enet0^2))
data.frame(
  RMSE = RMSE(yhat.enet0, test0$popularity^0.5),
  R2 = R2(yhat.enet0, test0$popularity^0.5)
)

For songs with Mode 1

Full Multiple linear model

# Full multiple linear regression on sqrt popularity (mode-1 songs)
ml1.sqrt <- lm(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5))
summary(ml1.sqrt)
## 
## Call:
## lm(formula = popularity ~ ., data = train1 %>% mutate(popularity = popularity^0.5))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4223 -1.2067  0.1313  1.3255  5.2394 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.857177   0.401243  27.059  < 2e-16 ***
## duration         -0.554933   0.044683 -12.419  < 2e-16 ***
## acousticness      0.080560   0.148271   0.543 0.586927    
## danceability      0.050885   0.254514   0.200 0.841542    
## energy           -3.077791   0.280908 -10.957  < 2e-16 ***
## instrumentalness -1.877299   0.352964  -5.319 1.09e-07 ***
## key1              0.321294   0.091623   3.507 0.000457 ***
## key2              0.207166   0.098073   2.112 0.034699 *  
## key3              0.430613   0.150398   2.863 0.004210 ** 
## key4             -0.029116   0.127272  -0.229 0.819058    
## key5              0.204278   0.111709   1.829 0.067504 .  
## key6              0.216946   0.109298   1.985 0.047206 *  
## key7              0.283270   0.091145   3.108 0.001894 ** 
## key8              0.297098   0.105736   2.810 0.004975 ** 
## key9              0.270884   0.113464   2.387 0.017001 *  
## key10             0.080066   0.132171   0.606 0.544685    
## key11             0.389907   0.117678   3.313 0.000928 ***
## loudness          0.386256   0.017387  22.216  < 2e-16 ***
## speechiness       4.579814   0.414965  11.037  < 2e-16 ***
## tempo            -0.001474   0.000958  -1.538 0.124055    
## valence          -0.849286   0.154553  -5.495 4.08e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.821 on 5483 degrees of freedom
## Multiple R-squared:  0.1499, Adjusted R-squared:  0.1468 
## F-statistic: 48.35 on 20 and 5483 DF,  p-value: < 2.2e-16

prediction on test data

# prediction on test data
yhat.mlr = predict(ml1.sqrt, newdata = test1 %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.mlr <- yhat.mlr - test1$popularity^0.5
rmse.mlr <- sqrt(mean(error.mlr^2))
# Test RMSE and R^2 on the sqrt scale
data.frame(
  RMSE = RMSE(yhat.mlr, test1$popularity^0.5),
  R2 = R2(yhat.mlr, test1$popularity^0.5)
)
##       RMSE        R2
## 1 1.858473 0.1257546

Variable Selection: Stepwise 10 fold cross validation

set.seed(123)
# 10-fold cross-validation control (re-created; same settings as for mode 0)
train_control <- trainControl(method = "cv", 
                              number = 10) 
# capture.output() suppresses the verbose stepAIC trace
bye <- capture.output(mlr.step_kcv <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5),  
                 method = "lmStepAIC", trControl = train_control)) 
print(mlr.step_kcv)
## Linear Regression with Stepwise Selection 
## 
## 5504 samples
##   10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4953, 4954, 4954, 4955, 4953, 4953, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   1.823965  0.1452973  1.468792
# Coefficients retained by the stepwise AIC selection (mode 1)
mlr.step_kcv$finalModel 
## 
## Call:
## lm(formula = .outcome ~ duration + energy + instrumentalness + 
##     key1 + key2 + key3 + key5 + key6 + key7 + key8 + key9 + key11 + 
##     loudness + speechiness + tempo + valence, data = dat)
## 
## Coefficients:
##      (Intercept)          duration            energy  instrumentalness  
##        10.984803         -0.555451         -3.160519         -1.884462  
##             key1              key2              key3              key5  
##         0.311835          0.198954          0.425083          0.196319  
##             key6              key7              key8              key9  
##         0.209863          0.274552          0.288519          0.261604  
##            key11          loudness       speechiness             tempo  
##         0.383154          0.387173          4.578753         -0.001535  
##          valence  
##        -0.836556

prediction on test data

# prediction on test data
# key.dummy() expands the key factor into the dummy columns the finalModel expects
yhat.step_kcv = predict(mlr.step_kcv$finalModel, newdata=key.dummy(test1) %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.step_kcv <- yhat.step_kcv - test1$popularity^0.5
rmse.step_kcv <- sqrt(mean(error.step_kcv^2))
data.frame(
  RMSE = RMSE(yhat.step_kcv, test1$popularity^0.5),
  R2 = R2(yhat.step_kcv, test1$popularity^0.5)
)
##       RMSE        R2
## 1 1.858518 0.1257011

Regularized Regression: Ridge

# Ridge regression (alpha = 0) for mode-1 songs; lambda tuned by 10-fold CV
lm.ridge1 <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5), method = "glmnet",
                      trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.ridge1$bestTune
##    alpha lambda
## 45     0  0.045
# best coefficient
lm.ridge1.model <- coef(lm.ridge1$finalModel, lm.ridge1$bestTune$lambda)
lm.ridge1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)      10.347839763
## duration         -0.543796248
## acousticness      0.169898922
## danceability      0.049633951
## energy           -2.620990536
## instrumentalness -1.927744817
## key1              0.293618095
## key2              0.182685249
## key3              0.399431145
## key4             -0.049409156
## key5              0.179350934
## key6              0.188771865
## key7              0.259698734
## key8              0.273385099
## key9              0.249069582
## key10             0.051821098
## key11             0.361138772
## loudness          0.355806469
## speechiness       4.325244621
## tempo            -0.001381351
## valence          -0.858954968
# prediction on test data
yhat.ridge1 = predict(lm.ridge1, s=lm.ridge1$bestTune,newdata=test1%>% mutate(popularity = popularity^0.5))
# RMSE for test data
# Test RMSE and R^2 on the sqrt scale
data.frame(
  RMSE = RMSE(yhat.ridge1, test1$popularity^0.5),
  R2 = R2(yhat.ridge1, test1$popularity^0.5)
)
##       RMSE        R2
## 1 1.858182 0.1253763

Regularized Regression: Lasso

# Lasso regression (alpha = 1) for mode-1 songs; lambda tuned by 10-fold CV
lm.lasso1 <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5), method = "glmnet",
                      trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 1,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.lasso1$bestTune
##   alpha lambda
## 3     1  0.003
# best coefficient
lm.lasso1.model <- coef(lm.lasso1$finalModel, lm.lasso1$bestTune$lambda)
lm.lasso1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)      10.823106069
## duration         -0.549904268
## acousticness      0.078797866
## danceability      0.007432375
## energy           -3.012459438
## instrumentalness -1.855427319
## key1              0.268514415
## key2              0.153367880
## key3              0.368124062
## key4             -0.060812240
## key5              0.148197878
## key6              0.160460653
## key7              0.231766196
## key8              0.242516531
## key9              0.215811156
## key10             0.020377237
## key11             0.334220735
## loudness          0.381054835
## speechiness       4.504126435
## tempo            -0.001394241
## valence          -0.829567095
# prediction on test data
yhat.lasso1 = predict(lm.lasso1, s=lm.lasso1$bestTune,newdata=test1%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.lasso1 <- yhat.lasso1 - test1$popularity^0.5
rmse.lasso1 <- sqrt(mean(error.lasso1^2))
# Test RMSE and R^2 on the sqrt scale
data.frame(
  RMSE = RMSE(yhat.lasso1, test1$popularity^0.5),
  R2 = R2(yhat.lasso1, test1$popularity^0.5)
)
##       RMSE        R2
## 1 1.858393 0.1255584

Regularized Regression: Elastic Net

# Elastic net for mode-1 songs: tune alpha and lambda jointly by 10-fold CV.
# FIX: removed family = "binomial" — the response (sqrt popularity) is
# continuous, so glmnet must use its default gaussian family; binomial
# requires a two-class outcome and caused this chunk to fail (no output
# appears in the transcript).
lm.enet1 <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5), method = "glmnet",
                      trControl = train_control, 
                      tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
lm.enet1$bestTune
# best coefficient
lm.enet1.model <- coef(lm.enet1$finalModel, lm.enet1$bestTune$lambda)
lm.enet1.model
# prediction on test data
yhat.enet1 = predict(lm.enet1, s=lm.enet1$bestTune,newdata=test1%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.enet1 <- yhat.enet1 - test1$popularity^0.5
rmse.enet1 <- sqrt(mean(error.enet1^2))
data.frame(
  RMSE = RMSE(yhat.enet1, test1$popularity^0.5),
  R2 = R2(yhat.enet1, test1$popularity^0.5)
)

Logistic (classification approach)

Use 70% train, 15% validation, 15% test, to use validation for finding optimal cutoff value.

# Keep the binary label plus the audio features, then split by mode and drop
# the (now constant) mode column from each half.
kpop.logit <- data %>%
  dplyr::select(popular, duration, acousticness, danceability, energy,
                instrumentalness, key, loudness, mode, speechiness, tempo,
                valence)
logit.kpop0 <- kpop.logit %>% filter(mode == 0) %>% select(-mode)
logit.kpop1 <- kpop.logit %>% filter(mode == 1) %>% select(-mode)

### Kpop mode 0 train and test
# (earlier 75/25 split, superseded by the 3-way partition below)
# smpl.size0 <- floor(0.75*nrow(logit.kpop0))
# set.seed(123)
# smpl0 <- sample(nrow(logit.kpop0), smpl.size0, replace = FALSE)
# og.logit.train0 <- logit.kpop0[smpl0,]
# og.logit.test0 <- logit.kpop0[-smpl0,]
set.seed(123)
# partition.3() (from thesis_functions.R) splits 70% train / 15% validation / 15% test
p3 <- partition.3(logit.kpop0, 0.70, 0.15)
logit.train0 <- p3$data.train
logit.valid0 <- p3$data.val
logit.test0 <- p3$data.test
# train + validation combined, used when refitting final models
all.train0 <- rbind(logit.train0, logit.valid0)

# ### Kpop mode 1 train and test
# smpl.size1 <- floor(0.75*nrow(logit.kpop1))
# set.seed(123)
# smpl1 <- sample(nrow(logit.kpop1), smpl.size1, replace = FALSE)
# logit.train1 <- logit.kpop1[smpl1,]
# logit.test1 <- logit.kpop1[-smpl1,]
set.seed(123)
p3 <- partition.3(logit.kpop1, 0.70, 0.15)
logit.train1 <- p3$data.train
logit.valid1 <- p3$data.val
logit.test1 <- p3$data.test
all.train1 <- rbind(logit.train1, logit.valid1)

Mode 0 popularity

Full Logistic Model: train/valid/test

Fit a logistic model on the training data, use the validation set to find the optimal cutoff, then refit the final model on the combined train + validation data before evaluating on test.

# Fit logistic model on training data
v.logit.model0 <- glm(popular ~ ., family=binomial(link='logit'),data= logit.train0)

#search for best cutoff
# opt.cut.func()/opt.cut.plot() come from thesis_functions.R; the search is
# done on the validation set
out0 <- opt.cut.func(v.logit.model0, logit.valid0)
opt.cut.plot(out0)

# cutoff that best balances sensitivity and specificity
out0$cutoff[which.min(out0$ssdiff.vec)]
## [1] 0.14
# cutoff that maximizes kappa
v.opt.cut0 <- out0$cutoff[which.max(out0$kappa.vec)]
v.opt.cut0
## [1] 0.18

Fit final model (combo of train and validation)

# Final logistic model, refit on train + validation combined; this is the
# model that should be used for the test-set predictions below
v.model.final <-  glm(popular ~ ., data=all.train0, family=binomial(link='logit'))

predict on test using 0.5 cutoff

# Predict on test with the final model at the default 0.5 cutoff.
# FIX: use v.model.final (fit on train + validation, as the text states)
# instead of the training-only v.logit.model0, and correct the stale
# "cutoff = 0.15" comment — the code uses 0.5.
v.prob.test <- predict(v.model.final, newdata = logit.test0, type = "response")
v.pred.test <- ifelse(v.prob.test > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(v.pred.test), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 606  93
##          1   1   1
##                                           
##                Accuracy : 0.8659          
##                  95% CI : (0.8384, 0.8903)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : 0.5275          
##                                           
##                   Kappa : 0.0153          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.010638        
##             Specificity : 0.998353        
##          Pos Pred Value : 0.500000        
##          Neg Pred Value : 0.866953        
##              Prevalence : 0.134094        
##          Detection Rate : 0.001427        
##    Detection Prevalence : 0.002853        
##       Balanced Accuracy : 0.504495        
##                                           
##        'Positive' Class : 1               
## 

predict on optimal cutoff (0.18)

# Predict on test at the validation-selected optimal-kappa cutoff (0.18).
# FIX: use v.model.final (fit on train + validation) instead of the
# training-only v.logit.model0.
v.prob.test <- predict(v.model.final, newdata = logit.test0, type = "response")
v.pred.test <- ifelse(v.prob.test > v.opt.cut0, 1, 0) # using cutoff = 0.18
confusionMatrix(as.factor(v.pred.test), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 482  67
##          1 125  27
##                                           
##                Accuracy : 0.7261          
##                  95% CI : (0.6915, 0.7588)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0645          
##                                           
##  Mcnemar's Test P-Value : 3.895e-05       
##                                           
##             Sensitivity : 0.28723         
##             Specificity : 0.79407         
##          Pos Pred Value : 0.17763         
##          Neg Pred Value : 0.87796         
##              Prevalence : 0.13409         
##          Detection Rate : 0.03852         
##    Detection Prevalence : 0.21683         
##       Balanced Accuracy : 0.54065         
##                                           
##        'Positive' Class : 1               
## 

Predict on optimal balance between sensitivity and specificity (0.14)

# Predict on test at the cutoff balancing sensitivity and specificity (0.14).
# FIX: use v.model.final (fit on train + validation) instead of the
# training-only v.logit.model0.
v.prob.test <- predict(v.model.final, newdata = logit.test0, type = "response")
v.pred.test <- ifelse(v.prob.test > 0.14, 1, 0) # using cutoff = 0.14
confusionMatrix(as.factor(v.pred.test), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 383  42
##          1 224  52
##                                           
##                Accuracy : 0.6205          
##                  95% CI : (0.5835, 0.6566)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1013          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.55319         
##             Specificity : 0.63097         
##          Pos Pred Value : 0.18841         
##          Neg Pred Value : 0.90118         
##              Prevalence : 0.13409         
##          Detection Rate : 0.07418         
##    Detection Prevalence : 0.39372         
##       Balanced Accuracy : 0.59208         
##                                           
##        'Positive' Class : 1               
## 

Variable Selection: Stepwise (10 fold CV)

step-wise 10 fold cross validation

set.seed(123)
# 10-fold cross-validation control for the stepwise logistic model
train_control <- trainControl(method = "cv", 
                              number = 10) 

# Fit K-fold CV model  
# capture.output() suppresses the verbose stepAIC trace
meh <- capture.output(step0_kcv <- train(popular ~ ., data = logit.train0, family = binomial(), 
                 method = "glmStepAIC", trControl = train_control, verbose = FALSE))
# Keep only the underlying glm; predict() on it needs dummy-coded key
# columns, hence key.dummy() in the prediction chunks below
step0_kcv <- step0_kcv$finalModel
step0_kcv
## 
## Call:  NULL
## 
## Coefficients:
##      (Intercept)          duration      acousticness            energy  
##           5.2007           -0.5129           -1.4480           -3.7363  
## instrumentalness              key3              key4          loudness  
##          -2.3933            0.7328           -0.3737            0.3399  
##      speechiness           valence  
##           3.1716           -1.4940  
## 
## Degrees of Freedom: 3270 Total (i.e. Null);  3261 Residual
## Null Deviance:       2609 
## Residual Deviance: 2453  AIC: 2473
# Predict on test with the training-only stepwise model at the default cutoff
kcv.prob.test0 <- predict(step0_kcv, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")

Find optimal cut off value:

#search for best cutoff
# cutoff search on the validation set (dummy-coded for the glm)
kcv.out0 <- opt.cut.func(step0_kcv, key.dummy(logit.valid0))
opt.cut.plot(kcv.out0)

# cutoff that best balances sensitivity and specificity
kcv.out0$cutoff[which.min(kcv.out0$ssdiff.vec)]
## [1] 0.14
# cutoff that maximizes kappa
kcv.out0.cut0 <- kcv.out0$cutoff[which.max(kcv.out0$kappa.vec)]
kcv.out0.cut0
## [1] 0.17

Fit final model (combo of train and validation)

set.seed(123)
# Refit the stepwise logistic model on train + validation combined
finalmeh <- capture.output(step0_kcv.final0 <- train(popular ~ ., data = all.train0, family = binomial(), 
                 method = "glmStepAIC", trControl = train_control, verbose = FALSE))
step0_kcv.final0 <- step0_kcv.final0$finalModel
step0_kcv.final0
## 
## Call:  NULL
## 
## Coefficients:
##      (Intercept)          duration      acousticness            energy  
##           5.3767           -0.5256           -1.5256           -3.8603  
## instrumentalness              key3              key4          loudness  
##          -2.7390            0.6388           -0.4434            0.3518  
##      speechiness           valence  
##           3.3281           -1.5226  
## 
## Degrees of Freedom: 3971 Total (i.e. Null);  3962 Residual
## Null Deviance:       3102 
## Residual Deviance: 2908  AIC: 2928

predict on test using 0.5 cutoff

# Predict on test with the final stepwise model at the default cutoff
kcv.prob.test0 <- predict(step0_kcv.final0, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 606  93
##          1   1   1
##                                           
##                Accuracy : 0.8659          
##                  95% CI : (0.8384, 0.8903)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : 0.5275          
##                                           
##                   Kappa : 0.0153          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.010638        
##             Specificity : 0.998353        
##          Pos Pred Value : 0.500000        
##          Neg Pred Value : 0.866953        
##              Prevalence : 0.134094        
##          Detection Rate : 0.001427        
##    Detection Prevalence : 0.002853        
##       Balanced Accuracy : 0.504495        
##                                           
##        'Positive' Class : 1               
## 

predict on test using optimal kappa, 0.17 cutoff

# Predict on test at the validation-selected optimal-kappa cutoff
kcv.prob.test0 <- predict(step0_kcv.final0, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.17, 1, 0) # using cutoff = 0.17
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 475  65
##          1 132  29
##                                          
##                Accuracy : 0.719          
##                  95% CI : (0.6841, 0.752)
##     No Information Rate : 0.8659         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.07           
##                                          
##  Mcnemar's Test P-Value : 2.572e-06      
##                                          
##             Sensitivity : 0.30851        
##             Specificity : 0.78254        
##          Pos Pred Value : 0.18012        
##          Neg Pred Value : 0.87963        
##              Prevalence : 0.13409        
##          Detection Rate : 0.04137        
##    Detection Prevalence : 0.22967        
##       Balanced Accuracy : 0.54552        
##                                          
##        'Positive' Class : 1              
## 

predict on test using 0.14 cutoff, optimal balance between sensitivity and specificity

# Predict on test at the cutoff balancing sensitivity and specificity
# (comment corrected: the code uses 0.14, not 0.16)
kcv.prob.test0 <- predict(step0_kcv.final0, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.14, 1, 0) # using cutoff = 0.14
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 397  45
##          1 210  49
##                                           
##                Accuracy : 0.6362          
##                  95% CI : (0.5994, 0.6719)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1007          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5213          
##             Specificity : 0.6540          
##          Pos Pred Value : 0.1892          
##          Neg Pred Value : 0.8982          
##              Prevalence : 0.1341          
##          Detection Rate : 0.0699          
##    Detection Prevalence : 0.3695          
##       Balanced Accuracy : 0.5877          
##                                           
##        'Positive' Class : 1               
## 

Variable Selection: All possible regression

set.seed(123)
# Exhaustive all-subsets logistic regression (main effects only), ranked by AIC
glmulti.out0 <- glmulti(popular ~ ., data = logit.train0,
                           level = 1,               # No interaction considered
            method = "h",            # Exhaustive approach
            crit = "aic",            # AIC as criteria
            confsetsize = 5,         # Keep 5 best models
            plotty = F, report = F,  # No plot or interim reports
            fitfunction = "glm",     # glm fit function
            family = binomial)       # binomial to run logistic regression 
#glmulti.out0@formulas

view summary of top model

summary(glmulti.out0@objects[[1]])
## 
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3900  -0.5880  -0.4766  -0.3530   2.4475  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        5.2221     0.7349   7.106 1.19e-12 ***
## duration          -0.5083     0.1054  -4.822 1.42e-06 ***
## acousticness      -1.4563     0.3749  -3.885 0.000102 ***
## energy            -3.8200     0.5976  -6.393 1.63e-10 ***
## instrumentalness  -2.3856     1.8267  -1.306 0.191582    
## loudness           0.3438     0.0457   7.522 5.39e-14 ***
## speechiness        3.0901     0.7440   4.153 3.28e-05 ***
## valence           -1.4181     0.2775  -5.111 3.20e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2609.2  on 3270  degrees of freedom
## Residual deviance: 2465.1  on 3263  degrees of freedom
## AIC: 2481.1
## 
## Number of Fisher Scoring iterations: 6

Store model

# Keep the AIC-best model fit on training data only.
allreg.logit0 <- glmulti.out0@objects[[1]]
#search for best cutoff on the validation set
allreg.out0 <- opt.cut.func(allreg.logit0, logit.valid0)
opt.cut.plot(allreg.out0)

# cutoff minimizing |sensitivity - specificity|
allreg.out0$cutoff[which.min(allreg.out0$ssdiff.vec)]
## [1] 0.15
# cutoff maximizing kappa
allreg.out0.cut0 <- allreg.out0$cutoff[which.max(allreg.out0$kappa.vec)]
allreg.out0.cut0
## [1] 0.14
## [1] 0.14

fit final model to combo of training and validation

set.seed(123)
# Refit the exhaustive search on the combined train + validation data.
glmulti.out0 <- glmulti(popular ~ ., data = all.train0,
                           level = 1,               # No interaction considered
            method = "h",            # Exhaustive approach
            crit = "aic",            # AIC as criteria
            confsetsize = 5,         # Keep 5 best models
            plotty = F, report = F,  # No plot or interim reports
            fitfunction = "glm",     # glm function
            family = binomial)       # binomial to run logistic regression 
#glmulti.out0@formulas
summary(glmulti.out0@objects[[1]])
## 
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4130  -0.5781  -0.4653  -0.3432   2.4901  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       5.36929    0.67847   7.914 2.50e-15 ***
## duration         -0.51557    0.09501  -5.426 5.75e-08 ***
## acousticness     -1.53724    0.34555  -4.449 8.64e-06 ***
## energy           -3.95482    0.55281  -7.154 8.43e-13 ***
## instrumentalness -2.72227    1.84991  -1.472    0.141    
## loudness          0.35437    0.04254   8.330  < 2e-16 ***
## speechiness       3.27806    0.67787   4.836 1.33e-06 ***
## valence          -1.44487    0.25585  -5.647 1.63e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3102.1  on 3971  degrees of freedom
## Residual deviance: 2922.2  on 3964  degrees of freedom
## AIC: 2938.2
## 
## Number of Fisher Scoring iterations: 6

store final model

allreg.logit0.final <- glmulti.out0@objects[[1]]

Predictions with 0.5 as the cut off

# Score the held-out test set with the final all-possible-regressions model,
# then dichotomize at the conventional 0.5 probability threshold.
allreg.prob.test0 <- predict(allreg.logit0.final, newdata = logit.test0, type = "response")
allreg.pred.test0 <- as.numeric(allreg.prob.test0 > 0.5)  # standard cutoff 0.5
confusionMatrix(as.factor(allreg.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 606  93
##          1   1   1
##                                           
##                Accuracy : 0.8659          
##                  95% CI : (0.8384, 0.8903)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : 0.5275          
##                                           
##                   Kappa : 0.0153          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.010638        
##             Specificity : 0.998353        
##          Pos Pred Value : 0.500000        
##          Neg Pred Value : 0.866953        
##              Prevalence : 0.134094        
##          Detection Rate : 0.001427        
##    Detection Prevalence : 0.002853        
##       Balanced Accuracy : 0.504495        
##                                           
##        'Positive' Class : 1               
## 

Predictions where cut off yields the best kappa, 0.14

# Re-score the test set using the kappa-optimal cutoff found on validation (0.14).
allreg.prob.test0 <- predict(allreg.logit0.final, newdata = logit.test0, type = "response")
allreg.pred.test0 <- ifelse(allreg.prob.test0 > allreg.out0.cut0, 1, 0) # using cutoff 0.14
confusionMatrix(as.factor(allreg.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 404  42
##          1 203  52
##                                           
##                Accuracy : 0.6505          
##                  95% CI : (0.6139, 0.6858)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1269          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.55319         
##             Specificity : 0.66557         
##          Pos Pred Value : 0.20392         
##          Neg Pred Value : 0.90583         
##              Prevalence : 0.13409         
##          Detection Rate : 0.07418         
##    Detection Prevalence : 0.36377         
##       Balanced Accuracy : 0.60938         
##                                           
##        'Positive' Class : 1               
## 

Predictions where cut off is the best balance of sensitivity and specificity, 0.15

# Score the test set, this time thresholding at 0.15 — the validation cutoff
# that best balanced sensitivity and specificity.
allreg.prob.test0 <- predict(allreg.logit0.final, newdata = logit.test0, type = "response")
allreg.pred.test0 <- as.numeric(allreg.prob.test0 > 0.15)  # cutoff 0.15 (sens/spec balance)
confusionMatrix(as.factor(allreg.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 435  51
##          1 172  43
##                                          
##                Accuracy : 0.6819         
##                  95% CI : (0.646, 0.7162)
##     No Information Rate : 0.8659         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1128         
##                                          
##  Mcnemar's Test P-Value : 9.297e-16      
##                                          
##             Sensitivity : 0.45745        
##             Specificity : 0.71664        
##          Pos Pred Value : 0.20000        
##          Neg Pred Value : 0.89506        
##              Prevalence : 0.13409        
##          Detection Rate : 0.06134        
##    Detection Prevalence : 0.30670        
##       Balanced Accuracy : 0.58704        
##                                          
##        'Positive' Class : 1              
## 

Regularized Regression: Ridge 10 fold Cross Validation

set.seed(123)
# Ridge logistic regression (alpha fixed at 0), lambda tuned over a grid via 10-fold CV.
ridge0 <- train(popular ~ ., data = logit.train0, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
ridge0$bestTune
##     alpha lambda
## 100     0    0.1
# coefficients at the selected lambda
ridge0.model <- coef(ridge0$finalModel, ridge0$bestTune$lambda)
ridge0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)       0.3812536494
## duration         -0.2559119522
## acousticness     -0.3997989337
## danceability     -0.1441080290
## energy           -0.6313909327
## instrumentalness -0.8210624307
## key1             -0.0140152818
## key2             -0.1557323229
## key3              0.4654206935
## key4             -0.1797053979
## key5              0.0291019833
## key6              0.0724378741
## key7              0.0623824309
## key8             -0.0095258891
## key9              0.0216445665
## key10             0.0301392242
## key11            -0.0311864381
## loudness          0.0852587410
## speechiness       1.4727602887
## tempo             0.0001278062
## valence          -0.7066431211

Search for best cutoff using validation set

# Search the validation set for the best probability cutoff for the ridge model.
ridge0.out <- reg.opt.cut.func(ridge0, logit.valid0)
opt.cut.plot(ridge0.out)

# cut off by kappa
ridge0.out$cutoff[which.max(ridge0.out$kappa.vec)]
## [1] 0.15
# cut off by sensitivity/specificity balance
ridge0.out$cutoff[which.min(ridge0.out$ssdiff.vec)]
## [1] 0.14

create final model

set.seed(123)
# Refit the ridge model on the combined train + validation data.
ridge0 <- train(popular ~ ., data = all.train0, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
ridge0$bestTune
##     alpha lambda
## 100     0    0.1
# coefficients at the selected lambda
ridge0.model <- coef(ridge0$finalModel, ridge0$bestTune$lambda)
ridge0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)       0.321762194
## duration         -0.257359810
## acousticness     -0.409439651
## danceability     -0.062528153
## energy           -0.607499947
## instrumentalness -0.839357369
## key1              0.013386505
## key2             -0.091231783
## key3              0.400527208
## key4             -0.198298922
## key5              0.053748073
## key6              0.142443327
## key7              0.036589828
## key8              0.030302380
## key9             -0.041870877
## key10             0.034094167
## key11            -0.034462056
## loudness          0.084161023
## speechiness       1.615890923
## tempo            -0.000348579
## valence          -0.724448093

predict and evaluate on the test set where cutoff is at 0.5

# Test-set class probabilities from the final ridge fit; column 2 is P(popular = 1).
# NOTE(review): caret's predict.train already uses bestTune internally, so the
# `s =` argument is presumably ignored here — confirm against ?predict.train.
prob.ridge0 <- predict(ridge0, s = ridge0$bestTune, logit.test0, type = "prob")
pred.ridge0 <- ifelse(prob.ridge0[,2] > 0.50, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(pred.ridge0), as.factor(logit.test0$popular), 
                positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.ridge0),
## as.factor(logit.test0$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 607  94
##          1   0   0
##                                           
##                Accuracy : 0.8659          
##                  95% CI : (0.8384, 0.8903)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : 0.5275          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.8659          
##              Prevalence : 0.1341          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
## 

predict and evaluate on the test set where cutoff is at 0.15 corresponding to optimal kappa

# Test-set predictions at the kappa-optimal validation cutoff (0.15).
# NOTE(review): fixed two typos — the probabilities were assigned to `prob.ridg0`
# (so the next line silently reused the stale `prob.ridge0` object) and
# `ridge0bestTune` was missing the `$` accessor.
prob.ridge0 <- predict(ridge0, s = ridge0$bestTune, logit.test0, type = "prob")
pred.ridge0 <- ifelse(prob.ridge0[,2] > 0.15, 1, 0) # using cutoff = 0.15
confusionMatrix(as.factor(pred.ridge0), as.factor(logit.test0$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 464  63
##          1 143  31
##                                           
##                Accuracy : 0.7061          
##                  95% CI : (0.6709, 0.7396)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0693          
##                                           
##  Mcnemar's Test P-Value : 3.709e-08       
##                                           
##             Sensitivity : 0.32979         
##             Specificity : 0.76442         
##          Pos Pred Value : 0.17816         
##          Neg Pred Value : 0.88046         
##              Prevalence : 0.13409         
##          Detection Rate : 0.04422         
##    Detection Prevalence : 0.24822         
##       Balanced Accuracy : 0.54710         
##                                           
##        'Positive' Class : 1               
## 

predict and evaluate on the test set where cutoff is at 0.14 corresponding to optimal sensitivity and specificity balance

# Test-set predictions at the sensitivity/specificity-balance cutoff (0.14).
# NOTE(review): fixed the same two typos as above — `prob.ridg0` (stale-object
# reuse on the next line) and `ridge0bestTune` missing the `$` accessor.
prob.ridge0 <- predict(ridge0, s = ridge0$bestTune, logit.test0, type = "prob")
pred.ridge0 <- ifelse(prob.ridge0[,2] > 0.14, 1, 0) # using cutoff = 0.14
confusionMatrix(as.factor(pred.ridge0), as.factor(logit.test0$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 403  48
##          1 204  46
##                                           
##                Accuracy : 0.6405          
##                  95% CI : (0.6037, 0.6761)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0901          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.48936         
##             Specificity : 0.66392         
##          Pos Pred Value : 0.18400         
##          Neg Pred Value : 0.89357         
##              Prevalence : 0.13409         
##          Detection Rate : 0.06562         
##    Detection Prevalence : 0.35663         
##       Balanced Accuracy : 0.57664         
##                                           
##        'Positive' Class : 1               
## 

Regularized Regression: Elastic Net 10 fold Cross Validation

set.seed(123)
# Elastic net: tune both alpha and lambda via 10-fold CV.
enet0 <- train(popular ~ ., data = logit.train0, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter — alpha = 0 was selected, i.e. the elastic net collapsed to
# pure ridge, which is why the coefficients below match the ridge fit above.
enet0$bestTune
##     alpha lambda
## 100     0    0.1
# best coefficient
enet0.model <- coef(enet0$finalModel, enet0$bestTune$lambda)
enet0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)       0.3812536494
## duration         -0.2559119522
## acousticness     -0.3997989337
## danceability     -0.1441080290
## energy           -0.6313909327
## instrumentalness -0.8210624307
## key1             -0.0140152818
## key2             -0.1557323229
## key3              0.4654206935
## key4             -0.1797053979
## key5              0.0291019833
## key6              0.0724378741
## key7              0.0623824309
## key8             -0.0095258891
## key9              0.0216445665
## key10             0.0301392242
## key11            -0.0311864381
## loudness          0.0852587410
## speechiness       1.4727602887
## tempo             0.0001278062
## valence          -0.7066431211

search for best cutoff with validation set

# Search the validation set for the best probability cutoff for the elastic net.
enet0.out <- reg.opt.cut.func(enet0, logit.valid0)
opt.cut.plot(enet0.out)

# cut off by kappa
enet0.out$cutoff[which.max(enet0.out$kappa.vec)]
## [1] 0.15
# cut off by sensitivity/specificity balance
enet0.out$cutoff[which.min(enet0.out$ssdiff.vec)]
## [1] 0.14

create final model on train and validation

set.seed(123)
# Refit the elastic net on the combined train + validation data.
enet0 <- train(popular ~ ., data = all.train0, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter — again alpha = 0 (pure ridge) was selected
enet0$bestTune
##     alpha lambda
## 100     0    0.1
# best coefficient
enet0.model <- coef(enet0$finalModel, enet0$bestTune$lambda)
enet0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)       0.321762194
## duration         -0.257359810
## acousticness     -0.409439651
## danceability     -0.062528153
## energy           -0.607499947
## instrumentalness -0.839357369
## key1              0.013386505
## key2             -0.091231783
## key3              0.400527208
## key4             -0.198298922
## key5              0.053748073
## key6              0.142443327
## key7              0.036589828
## key8              0.030302380
## key9             -0.041870877
## key10             0.034094167
## key11            -0.034462056
## loudness          0.084161023
## speechiness       1.615890923
## tempo            -0.000348579
## valence          -0.724448093

predict and evaluate on the test set where cutoff is at 0.5

# Test-set predictions from the final elastic-net fit at the default 0.5 cutoff.
prob.enet0 <- predict(enet0, s = enet0$bestTune, logit.test0, type = "prob")
pred.enet0 <- ifelse(prob.enet0[,2] > 0.50, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(pred.enet0), as.factor(logit.test0$popular), 
                positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.enet0),
## as.factor(logit.test0$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 607  94
##          1   0   0
##                                           
##                Accuracy : 0.8659          
##                  95% CI : (0.8384, 0.8903)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : 0.5275          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.8659          
##              Prevalence : 0.1341          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
## 

predict and evaluate on the test set where cutoff is at 0.15 corresponding to optimal kappa

# Test-set predictions at the kappa-optimal validation cutoff (0.15).
prob.enet0 <- predict(enet0, s = enet0$bestTune, logit.test0, type = "prob")
pred.enet0 <- ifelse(prob.enet0[,2] > 0.15, 1, 0) # using cutoff = 0.15
confusionMatrix(as.factor(pred.enet0), as.factor(logit.test0$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 464  63
##          1 143  31
##                                           
##                Accuracy : 0.7061          
##                  95% CI : (0.6709, 0.7396)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0693          
##                                           
##  Mcnemar's Test P-Value : 3.709e-08       
##                                           
##             Sensitivity : 0.32979         
##             Specificity : 0.76442         
##          Pos Pred Value : 0.17816         
##          Neg Pred Value : 0.88046         
##              Prevalence : 0.13409         
##          Detection Rate : 0.04422         
##    Detection Prevalence : 0.24822         
##       Balanced Accuracy : 0.54710         
##                                           
##        'Positive' Class : 1               
## 

predict and evaluate on the test set where cutoff is at 0.14 corresponding to optimal balance of sensitivity and specificity

# Test-set predictions at the sensitivity/specificity-balance cutoff (0.14).
prob.enet0 <- predict(enet0, s = enet0$bestTune, logit.test0, type = "prob")
pred.enet0 <- ifelse(prob.enet0[,2] > 0.14, 1, 0) # using cutoff = 0.14
confusionMatrix(as.factor(pred.enet0), as.factor(logit.test0$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 403  48
##          1 204  46
##                                           
##                Accuracy : 0.6405          
##                  95% CI : (0.6037, 0.6761)
##     No Information Rate : 0.8659          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0901          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.48936         
##             Specificity : 0.66392         
##          Pos Pred Value : 0.18400         
##          Neg Pred Value : 0.89357         
##              Prevalence : 0.13409         
##          Detection Rate : 0.06562         
##    Detection Prevalence : 0.35663         
##       Balanced Accuracy : 0.57664         
##                                           
##        'Positive' Class : 1               
## 

Mode 1 popularity

Full Logistic Model: train/valid/test

Fitting the full logistic model on the training data, then using the validation set to find the optimal probability cutoff.

# Fit logistic model on training data
v.logit.model1 <- glm(popular ~ ., family=binomial(link='logit'),data= logit.train1)

#search for best cutoff
out1 <- opt.cut.func(v.logit.model1, logit.valid1)
opt.cut.plot(out1)

out1$cutoff[which.min(out1$ssdiff.vec)]
## [1] 0.11
v.opt.cut1 <- out1$cutoff[which.max(out1$kappa.vec)]
v.opt.cut1
## [1] 0.15

Fit final model (combo of train and validation)

v.model.final1 <-  glm(popular ~ ., data=all.train1, family=binomial(link='logit'))

predict on test using 0.5 cutoff

# Test-set predictions at the default 0.5 cutoff.
# NOTE(review): switched to the final model `v.model.final1` (fit on train +
# validation, as done in every other section) — the original predicted with the
# training-only `v.logit.model1`, leaving the final model unused; the recorded
# output below was produced with the training-only model. Also corrected the
# inline comment, which claimed cutoff = 0.15 while the code uses 0.5.
v.prob.test1 <- predict(v.model.final1, newdata = logit.test1, type = "response")
v.pred.test1 <- ifelse(v.prob.test1 > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(v.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 987 113
##          1   1   0
##                                           
##                Accuracy : 0.8965          
##                  95% CI : (0.8769, 0.9138)
##     No Information Rate : 0.8974          
##     P-Value [Acc > NIR] : 0.5643          
##                                           
##                   Kappa : -0.0018         
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000000       
##             Specificity : 0.9989879       
##          Pos Pred Value : 0.0000000       
##          Neg Pred Value : 0.8972727       
##              Prevalence : 0.1026340       
##          Detection Rate : 0.0000000       
##    Detection Prevalence : 0.0009083       
##       Balanced Accuracy : 0.4994939       
##                                           
##        'Positive' Class : 1               
## 

predict on test using optimal cutoff (kappa), 0.15

# Test-set predictions at the kappa-optimal validation cutoff (0.15).
# NOTE(review): switched to the final model `v.model.final1`; the original
# predicted with the training-only `v.logit.model1` (recorded output below
# reflects the training-only model).
v.prob.test1 <- predict(v.model.final1, newdata = logit.test1, type = "response")
v.pred.test1 <- ifelse(v.prob.test1 > v.opt.cut1, 1, 0) # using cutoff = 0.15
confusionMatrix(as.factor(v.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 817  72
##          1 171  41
##                                           
##                Accuracy : 0.7793          
##                  95% CI : (0.7536, 0.8035)
##     No Information Rate : 0.8974          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1367          
##                                           
##  Mcnemar's Test P-Value : 3.243e-10       
##                                           
##             Sensitivity : 0.36283         
##             Specificity : 0.82692         
##          Pos Pred Value : 0.19340         
##          Neg Pred Value : 0.91901         
##              Prevalence : 0.10263         
##          Detection Rate : 0.03724         
##    Detection Prevalence : 0.19255         
##       Balanced Accuracy : 0.59488         
##                                           
##        'Positive' Class : 1               
## 

predict on test using optimal cutoff (sensitivity and specificity), 0.11

# Test-set predictions at the sensitivity/specificity-balance cutoff (0.11).
# NOTE(review): switched to the final model `v.model.final1`; the original
# predicted with the training-only `v.logit.model1` (recorded output below
# reflects the training-only model).
v.prob.test1 <- predict(v.model.final1, newdata = logit.test1, type = "response")
v.pred.test1 <- ifelse(v.prob.test1 > 0.11, 1, 0) # using cutoff = 0.11
confusionMatrix(as.factor(v.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 609  43
##          1 379  70
##                                           
##                Accuracy : 0.6167          
##                  95% CI : (0.5873, 0.6455)
##     No Information Rate : 0.8974          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1018          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.61947         
##             Specificity : 0.61640         
##          Pos Pred Value : 0.15590         
##          Neg Pred Value : 0.93405         
##              Prevalence : 0.10263         
##          Detection Rate : 0.06358         
##    Detection Prevalence : 0.40781         
##       Balanced Accuracy : 0.61793         
##                                           
##        'Positive' Class : 1               
## 

Variable Selection: Stepwise (10 fold CV)

step-wise 10 fold cross validation

set.seed(123)
train_control <- trainControl(method = "cv", 
                              number = 10) 

# Fit K-fold CV model; capture.output() suppresses the verbose stepAIC trace.
meh <- capture.output(step1_kcv <- train(popular ~ ., data = logit.train1, family = binomial(), 
                 method = "glmStepAIC", trControl = train_control, verbose = FALSE))
step1_kcv <- step1_kcv$finalModel
step1_kcv
## 
## Call:  NULL
## 
## Coefficients:
##      (Intercept)          duration      acousticness            energy  
##           4.0735           -0.4712           -0.8449           -3.3814  
## instrumentalness              key3              key4             key10  
##         -13.0994            0.6327           -0.5020           -0.3727  
##         loudness       speechiness           valence  
##           0.3220            5.0849           -1.2362  
## 
## Degrees of Freedom: 5136 Total (i.e. Null);  5126 Residual
## Null Deviance:       3568 
## Residual Deviance: 3357  AIC: 3379
#search for best cutoff on the validation set (key recoded to dummies to
#match the stepwise model's design matrix)
kcv.out1 <- opt.cut.func(step1_kcv, key.dummy(logit.valid1))
opt.cut.plot(kcv.out1)

# cutoff minimizing |sensitivity - specificity|
kcv.out1$cutoff[which.min(kcv.out1$ssdiff.vec)]
## [1] 0.11
# cutoff maximizing kappa
kcv.out1.cut1 <- kcv.out1$cutoff[which.max(kcv.out1$kappa.vec)]
kcv.out1.cut1
## [1] 0.13

Fit final model (combo of train and validation)

set.seed(123)
# Refit the stepwise-CV model on the combined train + validation data.
finalmeh <- capture.output(step1_kcv.final <- train(popular ~ ., data = all.train1, family = binomial(), 
                 method = "glmStepAIC", trControl = train_control, verbose = FALSE))
step1_kcv.final <- step1_kcv.final$finalModel
step1_kcv.final
## 
## Call:  NULL
## 
## Coefficients:
##      (Intercept)          duration      acousticness      danceability  
##         3.279565         -0.464924         -0.777317          0.976056  
##           energy  instrumentalness              key3              key8  
##        -3.477833        -12.106204          0.512832          0.267298  
##            key10          loudness       speechiness             tempo  
##        -0.459592          0.333033          4.790242          0.002819  
##          valence  
##        -1.411113  
## 
## Degrees of Freedom: 6237 Total (i.e. Null);  6225 Residual
## Null Deviance:       4360 
## Residual Deviance: 4095  AIC: 4121

predict on test using 0.5 cutoff

# Test-set predictions from the final stepwise model at the default 0.5 cutoff.
kcv.prob.test1 <- predict(step1_kcv.final, newdata = key.dummy(logit.test1), type = "response")
kcv.pred.test1 <- ifelse(kcv.prob.test1 > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(kcv.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 988 112
##          1   0   1
##                                           
##                Accuracy : 0.8983          
##                  95% CI : (0.8789, 0.9155)
##     No Information Rate : 0.8974          
##     P-Value [Acc > NIR] : 0.4854          
##                                           
##                   Kappa : 0.0158          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0088496       
##             Specificity : 1.0000000       
##          Pos Pred Value : 1.0000000       
##          Neg Pred Value : 0.8981818       
##              Prevalence : 0.1026340       
##          Detection Rate : 0.0009083       
##    Detection Prevalence : 0.0009083       
##       Balanced Accuracy : 0.5044248       
##                                           
##        'Positive' Class : 1               
## 

Using cutoff 0.13 which yields optimal kappa

# Test-set predictions at the kappa-optimal validation cutoff (0.13).
kcv.prob.test1 <- predict(step1_kcv.final, newdata = key.dummy(logit.test1), type = "response")
kcv.pred.test1 <- ifelse(kcv.prob.test1 > kcv.out1.cut1, 1, 0) # using cutoff = 0.13
confusionMatrix(as.factor(kcv.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 723  56
##          1 265  57
##                                           
##                Accuracy : 0.7084          
##                  95% CI : (0.6806, 0.7352)
##     No Information Rate : 0.8974          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1299          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.50442         
##             Specificity : 0.73178         
##          Pos Pred Value : 0.17702         
##          Neg Pred Value : 0.92811         
##              Prevalence : 0.10263         
##          Detection Rate : 0.05177         
##    Detection Prevalence : 0.29246         
##       Balanced Accuracy : 0.61810         
##                                           
##        'Positive' Class : 1               
## 

Using cut off 0.11 which yields the best balance between sensitivity and specificity.

# Score the test set with the final stepwise model and threshold at 0.11,
# the validation cutoff with the best sensitivity/specificity balance.
kcv.prob.test1 <- predict(step1_kcv.final, newdata = key.dummy(logit.test1), type = "response")
kcv.pred.test1 <- as.numeric(kcv.prob.test1 > 0.11)  # cutoff = 0.11 (sens/spec balance)
confusionMatrix(as.factor(kcv.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 594  45
##          1 394  68
##                                           
##                Accuracy : 0.6013          
##                  95% CI : (0.5717, 0.6303)
##     No Information Rate : 0.8974          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0857          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.60177         
##             Specificity : 0.60121         
##          Pos Pred Value : 0.14719         
##          Neg Pred Value : 0.92958         
##              Prevalence : 0.10263         
##          Detection Rate : 0.06176         
##    Detection Prevalence : 0.41962         
##       Balanced Accuracy : 0.60149         
##                                           
##        'Positive' Class : 1               
## 

Variable Selection: All possible regression

set.seed(123)
# Exhaustive (all-possible-regressions) search over main effects for mode 1.
glmulti.out1 <- glmulti(popular ~ ., data = logit.train1,
                           level = 1,               # No interaction considered
            method = "h",            # Exhaustive approach
            crit = "aic",            # AIC as criteria
            confsetsize = 5,         # Keep 5 best models
            plotty = F, report = F,  # No plot or interim reports
            fitfunction = "glm",     # glm function
            family = binomial)       # binomial to run logistic regression 
#glmulti.out1@formulas

view summary of top model

# Summary of the AIC-best logistic model from the exhaustive search.
summary(glmulti.out1@objects[[1]])
## 
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8240  -0.5209  -0.4300  -0.3327   2.7606  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        4.01931    0.62836   6.397 1.59e-10 ***
## duration          -0.46099    0.08964  -5.143 2.71e-07 ***
## acousticness      -0.83197    0.27043  -3.077  0.00209 ** 
## energy            -3.38962    0.52995  -6.396 1.59e-10 ***
## instrumentalness -13.17589    9.02825  -1.459  0.14445    
## loudness           0.32058    0.03853   8.321  < 2e-16 ***
## speechiness        5.09477    0.63162   8.066 7.25e-16 ***
## valence           -1.22718    0.24729  -4.963 6.96e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3568.2  on 5136  degrees of freedom
## Residual deviance: 3372.1  on 5129  degrees of freedom
## AIC: 3388.1
## 
## Number of Fisher Scoring iterations: 9

Store model

# Keep the AIC-best model from the all-possible-regressions search, then
# sweep classification cutoffs on the validation set.
# opt.cut.func / opt.cut.plot are project helpers (thesis_functions.R) --
# presumably they return kappa and |sens - spec| per candidate cutoff.
allreg.logit1 <- glmulti.out1@objects[[1]]
#search for best cutoff
allreg.out1 <- opt.cut.func(allreg.logit1, logit.valid1)
opt.cut.plot(allreg.out1)

# cutoff minimizing |sensitivity - specificity|
allreg.out1$cutoff[which.min(allreg.out1$ssdiff.vec)]
## [1] 0.11
# cutoff maximizing Cohen's kappa
allreg.out1.cut1 <- allreg.out1$cutoff[which.max(allreg.out1$kappa.vec)]
allreg.out1.cut1
## [1] 0.16

Fit the final model to the combined training and validation sets.

# Refit the exhaustive AIC search on the combined training + validation data
# (all.train1) to produce the final all-possible-regressions model.
set.seed(123)
glmulti.out1 <- glmulti(popular ~ ., data = all.train1,
                           level = 1,               # No interaction considered
            method = "h",            # Exhaustive approach
            crit = "aic",            # AIC as criteria
            confsetsize = 5,         # Keep 5 best models
            plotty = F, report = F,  # No plot or interim reports
            fitfunction = "glm",     # glm fitting function
            family = binomial)       # binomial to run logistic regression 
#glmulti.out1@formulas
# summary of the AIC-best refit
summary(glmulti.out1@objects[[1]])
## 
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6717  -0.5243  -0.4296  -0.3299   2.7251  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        3.289296   0.692110   4.753 2.01e-06 ***
## duration          -0.460833   0.081938  -5.624 1.86e-08 ***
## acousticness      -0.750350   0.249223  -3.011  0.00261 ** 
## danceability       0.942373   0.420840   2.239  0.02514 *  
## energy            -3.483207   0.478705  -7.276 3.43e-13 ***
## instrumentalness -11.929151   7.266196  -1.642  0.10065    
## loudness           0.333199   0.035260   9.450  < 2e-16 ***
## speechiness        4.822191   0.576504   8.365  < 2e-16 ***
## tempo              0.002978   0.001572   1.894  0.05822 .  
## valence           -1.405615   0.249178  -5.641 1.69e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4359.9  on 6237  degrees of freedom
## Residual deviance: 4107.6  on 6228  degrees of freedom
## AIC: 4127.6
## 
## Number of Fisher Scoring iterations: 9

store final model

# Keep the refit AIC-best model as the final all-possible-regressions model.
allreg.logit1.final <- glmulti.out1@objects[[1]]

Predictions with 0.5 as the cut off

# Test-set predictions from the final all-possible-regressions model,
# classified at the standard 0.5 cutoff (baseline for comparison).
allreg.prob.test1 <- predict(allreg.logit1.final, newdata = logit.test1, type = "response")
allreg.pred.test1 <- ifelse(allreg.prob.test1 > 0.5, 1, 0) # using standard cutoff 0.5
confusionMatrix(as.factor(allreg.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 988 112
##          1   0   1
##                                           
##                Accuracy : 0.8983          
##                  95% CI : (0.8789, 0.9155)
##     No Information Rate : 0.8974          
##     P-Value [Acc > NIR] : 0.4854          
##                                           
##                   Kappa : 0.0158          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0088496       
##             Specificity : 1.0000000       
##          Pos Pred Value : 1.0000000       
##          Neg Pred Value : 0.8981818       
##              Prevalence : 0.1026340       
##          Detection Rate : 0.0009083       
##    Detection Prevalence : 0.0009083       
##       Balanced Accuracy : 0.5044248       
##                                           
##        'Positive' Class : 1               
## 

Predictions where cut off is the best kappa, 0.16

# Same model, classified at allreg.out1.cut1 (0.16), the validation-set
# cutoff that maximized Cohen's kappa.
allreg.prob.test1 <- predict(allreg.logit1.final, newdata = logit.test1, type = "response")
allreg.pred.test1 <- ifelse(allreg.prob.test1 > allreg.out1.cut1, 1, 0) # using optimal cutoff 0.16
confusionMatrix(as.factor(allreg.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 841  77
##          1 147  36
##                                         
##                Accuracy : 0.7965        
##                  95% CI : (0.7715, 0.82)
##     No Information Rate : 0.8974        
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.1332        
##                                         
##  Mcnemar's Test P-Value : 4.022e-06     
##                                         
##             Sensitivity : 0.3186        
##             Specificity : 0.8512        
##          Pos Pred Value : 0.1967        
##          Neg Pred Value : 0.9161        
##              Prevalence : 0.1026        
##          Detection Rate : 0.0327        
##    Detection Prevalence : 0.1662        
##       Balanced Accuracy : 0.5849        
##                                         
##        'Positive' Class : 1             
## 

Predictions where cut off is the best balance of sensitivity and specificity, 0.11

# Same model, classified at 0.11, the validation-set cutoff that best
# balanced sensitivity and specificity.
allreg.prob.test1 <- predict(allreg.logit1.final, newdata = logit.test1, type = "response")
allreg.pred.test1 <- ifelse(allreg.prob.test1 > 0.11, 1, 0) # using optimal cutoff  0.11
confusionMatrix(as.factor(allreg.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 594  44
##          1 394  69
##                                           
##                Accuracy : 0.6022          
##                  95% CI : (0.5726, 0.6312)
##     No Information Rate : 0.8974          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0893          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.61062         
##             Specificity : 0.60121         
##          Pos Pred Value : 0.14903         
##          Neg Pred Value : 0.93103         
##              Prevalence : 0.10263         
##          Detection Rate : 0.06267         
##    Detection Prevalence : 0.42053         
##       Balanced Accuracy : 0.60592         
##                                           
##        'Positive' Class : 1               
## 

Regularized Regression: Ridge 10 fold Cross Validation

# Ridge logistic regression (alpha = 0) tuned over a lambda grid with the
# CV scheme in train_control (defined earlier in the file).
set.seed(123)
ridge1 <- train(popular ~ ., data = logit.train1, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
ridge1$bestTune
##     alpha lambda
## 100     0    0.1
# coefficients at the selected lambda
ridge1.model <- coef(ridge1$finalModel, ridge1$bestTune$lambda)
ridge1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)      -0.940884985
## duration         -0.198157468
## acousticness     -0.107972712
## danceability     -0.008980677
## energy           -0.260064046
## instrumentalness -0.827713363
## key1              0.046091952
## key2              0.008592692
## key3              0.349398913
## key4             -0.189817749
## key5              0.003945898
## key6             -0.027278685
## key7              0.091414861
## key8              0.090088954
## key9              0.089685627
## key10            -0.163351622
## key11            -0.024566167
## loudness          0.062462753
## speechiness       2.309752435
## tempo             0.001341193
## valence          -0.487758435

Search for best cutoff using validation set

# Sweep classification cutoffs for the ridge fit on the validation set.
# reg.opt.cut.func is the project helper for caret models (thesis_functions.R).
ridge1.out <- reg.opt.cut.func(ridge1, logit.valid1)
opt.cut.plot(ridge1.out)

# cut off by kappa
ridge1.out$cutoff[which.max(ridge1.out$kappa.vec)]
## [1] 0.12
# cut off by best balance of sensitivity and specificity
ridge1.out$cutoff[which.min(ridge1.out$ssdiff.vec)]
## [1] 0.11

create final model

# Refit ridge on the combined training + validation data (all.train1)
# to produce the final ridge model.
set.seed(123)
ridge1 <- train(popular ~ ., data = all.train1, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
ridge1$bestTune
##     alpha lambda
## 100     0    0.1
# coefficients at the selected lambda
ridge1.model <- coef(ridge1$finalModel, ridge1$bestTune$lambda)
ridge1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)      -1.031119627
## duration         -0.202122239
## acousticness     -0.103141402
## danceability      0.159103462
## energy           -0.270068249
## instrumentalness -0.817245806
## key1              0.026882836
## key2              0.007008190
## key3              0.266050136
## key4             -0.114394579
## key5             -0.025139220
## key6             -0.032287416
## key7              0.078237033
## key8              0.138366721
## key9              0.130311371
## key10            -0.214434940
## key11            -0.063822095
## loudness          0.064513136
## speechiness       2.177666732
## tempo             0.001583618
## valence          -0.474359450

predict and evaluate on the test set where cutoff is at 0.5

# Score the test set with the final ridge model; classify at the default
# 0.5 cutoff. caret's predict.train() always predicts at the tuned lambda,
# so the former `s = ridge1$bestTune` argument was a silently-ignored no-op
# and has been removed.
prob.ridge1 <- predict(ridge1, logit.test1, type = "prob")
pred.ridge1 <- ifelse(prob.ridge1[,2] > 0.50, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(pred.ridge1), as.factor(logit.test1$popular), 
                positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.ridge1),
## as.factor(logit.test1$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 988 113
##          1   0   0
##                                           
##                Accuracy : 0.8974          
##                  95% CI : (0.8779, 0.9147)
##     No Information Rate : 0.8974          
##     P-Value [Acc > NIR] : 0.525           
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.8974          
##              Prevalence : 0.1026          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
## 

predict and evaluate on the test set where cutoff is at 0.12 corresponding to optimal kappa

# Score the test set with the final ridge model; classify at 0.12, the
# validation-set cutoff that maximized kappa. The former
# `s = ridge1$bestTune` argument is removed: predict.train() predicts at
# the tuned lambda automatically and silently ignores `s`.
prob.ridge1 <- predict(ridge1, logit.test1, type = "prob")
pred.ridge1 <- ifelse(prob.ridge1[,2] > 0.12, 1, 0) # using cutoff = 0.12
confusionMatrix(as.factor(pred.ridge1), as.factor(logit.test1$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 703  60
##          1 285  53
##                                          
##                Accuracy : 0.6866         
##                  95% CI : (0.6583, 0.714)
##     No Information Rate : 0.8974         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.096          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.46903        
##             Specificity : 0.71154        
##          Pos Pred Value : 0.15680        
##          Neg Pred Value : 0.92136        
##              Prevalence : 0.10263        
##          Detection Rate : 0.04814        
##    Detection Prevalence : 0.30699        
##       Balanced Accuracy : 0.59028        
##                                          
##        'Positive' Class : 1              
## 

predict and evaluate on the test set where cutoff is at 0.11 corresponding to optimal balance of sensitivity and specificity

# Score the test set with the final ridge model; classify at 0.11, the
# validation-set cutoff that best balanced sensitivity and specificity.
# Fixed: the inline comment previously said 0.13 while the code used 0.11.
# The former `s = ridge1$bestTune` argument is removed: predict.train()
# predicts at the tuned lambda automatically and silently ignores `s`.
prob.ridge1 <- predict(ridge1, logit.test1, type = "prob")
pred.ridge1 <- ifelse(prob.ridge1[,2] > 0.11, 1, 0) # using cutoff = 0.11
confusionMatrix(as.factor(pred.ridge1), as.factor(logit.test1$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 557  36
##          1 431  77
##                                          
##                Accuracy : 0.5758         
##                  95% CI : (0.546, 0.6053)
##     No Information Rate : 0.8974         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0962         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.68142        
##             Specificity : 0.56377        
##          Pos Pred Value : 0.15157        
##          Neg Pred Value : 0.93929        
##              Prevalence : 0.10263        
##          Detection Rate : 0.06994        
##    Detection Prevalence : 0.46140        
##       Balanced Accuracy : 0.62259        
##                                          
##        'Positive' Class : 1              
## 

Regularized Regression: Lasso 10-fold Cross Validation (not viable here: the lasso penalty shrinks every coefficient to zero, leaving an intercept-only model)

# Lasso logistic regression (alpha = 1) over the same lambda grid.
# Fixed: set.seed(123) added before train() so the CV folds are
# reproducible, matching the ridge and elastic-net fits in this file.
set.seed(123)
lasso1 <- train(popular ~ ., data = logit.train1, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 1,lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
lasso1$bestTune
# coefficients at the selected lambda
lasso1.model <- coef(lasso1$finalModel, lasso1$bestTune$lambda)
lasso1.model

Search for best cutoff using validation set

# Cutoff search for the lasso fit on the validation set.
lasso1.out <- reg.opt.cut.func(lasso1, logit.valid1)
opt.cut.plot(lasso1.out)
# cut off by kappa
lasso1.out$cutoff[which.max(lasso1.out$kappa.vec)]
# cut off by best balance of sensitivity and specificity
lasso1.out$cutoff[which.min(lasso1.out$ssdiff.vec)]

Regularized Regression: Elastic Net 10 fold Cross Validation

# Elastic net: tune alpha over [0, 1] jointly with the lambda grid.
set.seed(123)
enet1 <- train(popular ~ ., data = logit.train1, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter (alpha = 0 was selected, i.e. pure ridge)
enet1$bestTune
##     alpha lambda
## 100     0    0.1
# coefficients at the selected alpha/lambda
enet1.model <- coef(enet1$finalModel, enet1$bestTune$lambda)
enet1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)      -0.940884985
## duration         -0.198157468
## acousticness     -0.107972712
## danceability     -0.008980677
## energy           -0.260064046
## instrumentalness -0.827713363
## key1              0.046091952
## key2              0.008592692
## key3              0.349398913
## key4             -0.189817749
## key5              0.003945898
## key6             -0.027278685
## key7              0.091414861
## key8              0.090088954
## key9              0.089685627
## key10            -0.163351622
## key11            -0.024566167
## loudness          0.062462753
## speechiness       2.309752435
## tempo             0.001341193
## valence          -0.487758435

search for best cutoff with validation set

# Cutoff search for the elastic-net fit on the validation set.
enet1.out <- reg.opt.cut.func(enet1, logit.valid1)
opt.cut.plot(enet1.out)

# cut off by kappa
enet1.out$cutoff[which.max(enet1.out$kappa.vec)]
## [1] 0.12
# cut off by best balance of sensitivity and specificity
enet1.out$cutoff[which.min(enet1.out$ssdiff.vec)]
## [1] 0.11

create final model on train and validation

# Refit elastic net on the combined training + validation data (all.train1)
# to produce the final elastic-net model.
set.seed(123)
enet1 <- train(popular ~ ., data = all.train1, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter (alpha = 0 selected again, i.e. ridge)
enet1$bestTune
##     alpha lambda
## 100     0    0.1
# coefficients at the selected alpha/lambda
enet1.model <- coef(enet1$finalModel, enet1$bestTune$lambda)
enet1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)      -1.031119627
## duration         -0.202122239
## acousticness     -0.103141402
## danceability      0.159103462
## energy           -0.270068249
## instrumentalness -0.817245806
## key1              0.026882836
## key2              0.007008190
## key3              0.266050136
## key4             -0.114394579
## key5             -0.025139220
## key6             -0.032287416
## key7              0.078237033
## key8              0.138366721
## key9              0.130311371
## key10            -0.214434940
## key11            -0.063822095
## loudness          0.064513136
## speechiness       2.177666732
## tempo             0.001583618
## valence          -0.474359450

predict and evaluate on the test set where cutoff is at 0.5

# Score the test set with the final elastic-net model; classify at the
# default 0.5 cutoff. The former `s = enet1$bestTune` argument is removed:
# caret's predict.train() predicts at the tuned parameters automatically
# and silently ignores `s`.
prob.enet1 <- predict(enet1, logit.test1, type = "prob")
pred.enet1 <- ifelse(prob.enet1[,2] > 0.50, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(pred.enet1), as.factor(logit.test1$popular), 
                positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.enet1),
## as.factor(logit.test1$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 988 113
##          1   0   0
##                                           
##                Accuracy : 0.8974          
##                  95% CI : (0.8779, 0.9147)
##     No Information Rate : 0.8974          
##     P-Value [Acc > NIR] : 0.525           
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.8974          
##              Prevalence : 0.1026          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
## 

predict and evaluate on the test set where cutoff is at 0.12 corresponding to optimal kappa

# Score the test set with the final elastic-net model; classify at 0.12,
# the validation-set cutoff that maximized kappa. The former
# `s = enet1$bestTune` argument is removed: predict.train() predicts at
# the tuned parameters automatically and silently ignores `s`.
prob.enet1 <- predict(enet1, logit.test1, type = "prob")
pred.enet1 <- ifelse(prob.enet1[,2] > 0.12, 1, 0) # using cutoff = 0.12
confusionMatrix(as.factor(pred.enet1), as.factor(logit.test1$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 703  60
##          1 285  53
##                                          
##                Accuracy : 0.6866         
##                  95% CI : (0.6583, 0.714)
##     No Information Rate : 0.8974         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.096          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.46903        
##             Specificity : 0.71154        
##          Pos Pred Value : 0.15680        
##          Neg Pred Value : 0.92136        
##              Prevalence : 0.10263        
##          Detection Rate : 0.04814        
##    Detection Prevalence : 0.30699        
##       Balanced Accuracy : 0.59028        
##                                          
##        'Positive' Class : 1              
## 

predict and evaluate on the test set where cutoff is at 0.11 corresponding to optimal balance of sensitivity and specificity

# Score the test set with the final elastic-net model; classify at 0.11,
# the validation-set cutoff that best balanced sensitivity and specificity.
# Fixed: the inline comment previously said 0.13 while the code used 0.11.
# The former `s = enet1$bestTune` argument is removed: predict.train()
# predicts at the tuned parameters automatically and silently ignores `s`.
prob.enet1 <- predict(enet1, logit.test1, type = "prob")
pred.enet1 <- ifelse(prob.enet1[,2] > 0.11, 1, 0) # using cutoff = 0.11
confusionMatrix(as.factor(pred.enet1), as.factor(logit.test1$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 557  36
##          1 431  77
##                                          
##                Accuracy : 0.5758         
##                  95% CI : (0.546, 0.6053)
##     No Information Rate : 0.8974         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0962         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.68142        
##             Specificity : 0.56377        
##          Pos Pred Value : 0.15157        
##          Neg Pred Value : 0.93929        
##              Prevalence : 0.10263        
##          Detection Rate : 0.06994        
##    Detection Prevalence : 0.46140        
##       Balanced Accuracy : 0.62259        
##                                          
##        'Positive' Class : 1              
##